import sys, os, re

from PyQt6.QtCore import Qt, QDir, QModelIndex
from PyQt6.QtGui import QIcon, QFileSystemModel, QStandardItemModel, QStandardItem, QPixmap, QKeySequence

# PyQt6
from ui.DocAnalyzer_ui import Ui_MainWindow
from PyQt6.QtWidgets import (QApplication, QMainWindow, QMessageBox,
                             QAbstractItemView, QMdiSubWindow, QPlainTextEdit, QLabel, QFileDialog, QWidget)
# Word
from docx import Document
# TTS
import pyttsx3
# 分词
from zhon.hanzi import punctuation
import jieba
# 词云
from wordcloud import WordCloud
# 爬取信息
from bs4 import BeautifulSoup
import json
# 文字识别
import pytesseract
from PIL import Image


class DocAnalyzer(QMainWindow, Ui_MainWindow):

    """
    这是一个基于 PyQt6 实现的文档可视化分析软件
    可对文档进行朗读、分词、生成词云，
    另外还能爬取网页心中的主题链接、识别图片中的文字。
    """

    # NOTE: the class docstring above is user-facing text -- it is what the
    # "About" dialog displays -- so it is intentionally kept in the UI language.
    #
    # Main window of a PyQt6 document analysis tool.  A directory tree on one
    # side selects a folder, the supported files of that folder are listed by
    # category, and every opened file or analysis result lives in its own MDI
    # subwindow.  Available analyses: text-to-speech, word segmentation, word
    # cloud, extraction of titled links from HTML, and OCR on images.

    # File extensions (lower case, without the dot) per supported category.
    _TEXT_TYPES = frozenset({'txt'})
    _WORD_TYPES = frozenset({'docx'})
    _HTML_TYPES = frozenset({'htm', 'html'})
    _IMAGE_TYPES = frozenset({'jpg', 'jpeg', 'png', 'gif', 'bmp', 'ico'})
    # Qt dynamic property that stores, on each MDI subwindow, the path of the
    # file it displays (unset for analysis-result windows).
    _SOURCE_PROPERTY = 'sourcePath'
    # Optional jieba user dictionary, looked up in the working directory.
    _USER_DICT = 'dict.txt'

    def __init__(self):
        """Build the UI and wire up the MDI area, actions and both tree views."""
        super().__init__()
        self.setupUi(self)
        # Directory currently listed and the file most recently opened from it.
        self.curPath = ''
        self.curFile = ''
        # Text of the latest analysis result (shown when _display_result gets
        # no explicit message).
        self.resText = ''
        # The jieba user dictionary is loaded lazily, at most once.
        self._userDictLoaded = False
        # Window chrome.
        self.setWindowIcon(QIcon('image/docanalyzer.jpg'))
        self.setWindowTitle('我的文档')
        self.setWindowFlag(Qt.WindowType.MSWindowsFixedSizeDialogHint)
        self.initMDI()
        self.initAction()
        self.initDocumentNevigate()
        self.initDocumentView()

    def initMDI(self):
        """Refresh the menu state whenever another MDI subwindow becomes active."""
        self.mdiArea.subWindowActivated.connect(self.updateMenuBar)

    def initAction(self):
        """Connect every QAction to its handler and set icons/shortcuts."""
        # File
        self.actSave.triggered.connect(self.save)
        self.actQuit.triggered.connect(self.close)
        # Window
        self.actClose.triggered.connect(self.mdiArea.closeActiveSubWindow)
        self.actCloseAll.triggered.connect(self.mdiArea.closeAllSubWindows)
        self.actTile.triggered.connect(self.mdiArea.tileSubWindows)
        self.actCasCade.triggered.connect(self.mdiArea.cascadeSubWindows)
        self.actPrev.triggered.connect(self.mdiArea.activatePreviousSubWindow)
        self.actNext.triggered.connect(self.mdiArea.activateNextSubWindow)
        # Analysis -- all disabled until a suitable subwindow is active.
        self.actSpeak.setIcon(QIcon('image/speak.jpg'))
        self.actSpeak.setShortcut('Ctrl+R')
        self.actSpeak.setEnabled(False)
        self.actSpeak.triggered.connect(self.readSpeak)
        self.actWord.setShortcut('Ctrl+W')
        self.actWord.setEnabled(False)
        self.actWord.triggered.connect(self.cutWord)
        self.actCloud.setIcon(QIcon('image/cloud.jpg'))
        self.actCloud.setEnabled(False)
        self.actCloud.triggered.connect(self.generCloud)
        self.actCrawl.setIcon(QIcon('image/crawl.jpg'))
        self.actCrawl.setEnabled(False)
        self.actCrawl.triggered.connect(self.titleCrawl)
        self.actRecog.setEnabled(False)
        self.actRecog.triggered.connect(self.textRecog)
        # About
        self.actAbout.triggered.connect(lambda: QMessageBox.about(self, '关于', self._about_text()))
        self.actAboutPyQt.triggered.connect(lambda: QMessageBox.aboutQt(self, '关于 PyQt'))

    def initDocumentNevigate(self):
        """Set up the directory tree: folders only, a single column, no header."""
        self.dirModel = QFileSystemModel(self)
        self.dirModel.setRootPath('')
        # Directories only, without the "." and ".." entries.
        self.dirModel.setFilter(QDir.Filter.AllDirs | QDir.Filter.NoDotAndDotDot)
        self.trvOSDirs.setModel(self.dirModel)
        self.trvOSDirs.setHeaderHidden(True)
        # Keep only the name column; hide every other column of the model.
        for col in range(1, self.dirModel.columnCount()):
            self.trvOSDirs.setColumnHidden(col, True)
        self.trvOSDirs.doubleClicked[QModelIndex].connect(self.showFiles)

    def initDocumentView(self):
        """Set up the file list that shows supported files grouped by category."""
        self.fileModel = QStandardItemModel(self)
        self.trvDocFiles.setModel(self.fileModel)
        self.trvDocFiles.setHeaderHidden(True)
        # Read-only, so a double click opens the file instead of renaming it.
        self.trvDocFiles.setEditTriggers(QAbstractItemView.EditTrigger.NoEditTriggers)
        self.trvDocFiles.doubleClicked[QModelIndex].connect(self.showContent)
        self.initFileModel()

    def showFiles(self, index):
        """
        List the supported files of the directory at ``index``.

        Files are grouped under the text / Word / web page / picture
        categories by their (case-insensitive) extension.  When the directory
        cannot be read, a warning is shown and the previous listing is kept.

        :param index: index of the double-clicked directory in ``dirModel``
        """
        path = self.dirModel.filePath(index)
        try:
            names = sorted(os.listdir(path), key=str.casefold)
        except OSError as err:
            # e.g. permission denied or a drive that is not ready.
            self._warn('无法打开目录：{}\n{}'.format(path, err))
            return

        self.initFileModel()
        self.curPath = path
        for name in names:
            if os.path.isdir(os.path.join(path, name)):
                continue
            category = self._category_for(self._extension(name))
            if category is None:
                continue
            parentItem, iconPath = category
            fileItem = QStandardItem(name)
            fileItem.setIcon(QIcon(iconPath))
            parentItem.appendRow(fileItem)
        self.trvDocFiles.expandAll()
        self.updateStatus()

    def showContent(self, index: QModelIndex):
        """
        Open the file at ``index`` in a new MDI subwindow.

        Text, HTML and Word files are shown as plain text, pictures as an
        image.  Double clicks on a category header are ignored.

        :param index: index of the double-clicked item in ``fileModel``
        """
        # Category headers are top-level rows; only their children are files.
        if not index.isValid() or not index.parent().isValid():
            return
        name = index.data()
        if not name:
            return
        self.curFile = name
        self.updateStatus()

        fullpath = os.path.join(self.curPath, name)
        filetype = self._extension(name)
        if filetype in self._IMAGE_TYPES:
            picture = QPixmap(fullpath)
            if picture.isNull():
                self._warn('无法加载图片：{}'.format(fullpath))
                return
            self._display_result(fullpath, picture, source=fullpath)
            return

        try:
            if filetype in self._TEXT_TYPES or filetype in self._HTML_TYPES:
                with open(fullpath, 'r', encoding='utf-8') as f:
                    content = f.read()
            elif filetype in self._WORD_TYPES:
                # One line per paragraph, each terminated by CRLF.
                content = ''.join(p.text + '\r\n' for p in Document(fullpath).paragraphs)
            else:
                return
        except Exception as err:
            # Slot boundary: an exception escaping a PyQt6 slot aborts the
            # application, and the Word reader raises library-specific errors,
            # so any read/decode failure is reported to the user instead.
            self._warn('无法打开文件：{}\n{}'.format(fullpath, err))
            return
        self._display_result(fullpath, content, source=fullpath)

    def initFileModel(self):
        """Reset the file list to its four empty categories and clear ``curFile``."""
        self.fileModel.clear()
        self.textFiles = QStandardItem('文本文件')
        self.wordFiles = QStandardItem('Word 文档')
        self.htmlFiles = QStandardItem('网页')
        self.picFiles = QStandardItem('图片')
        for category in (self.textFiles, self.wordFiles, self.htmlFiles, self.picFiles):
            self.fileModel.appendRow(category)
        self.curFile = ''
        self.updateStatus()

    def updateStatus(self):
        """Show the current directory and file in the status bar."""
        self.statusbar.showMessage(self.curPath + '/' + self.curFile)

    def updateMenuBar(self):
        """
        Enable exactly the analysis actions that fit the active subwindow.

        The decision is based on the active subwindow itself (its widget type
        and the source file recorded on it), not on the file that was opened
        last, so switching between windows keeps the menu consistent:

        * plain text that is not an HTML source: speak, segment, word cloud
        * HTML source: link extraction
        * picture opened from a file: OCR
        * no subwindow: everything disabled
        """
        widget = self._active_widget()
        filetype = self._extension(self._active_source())

        isText = isinstance(widget, QPlainTextEdit)
        isHtml = isText and filetype in self._HTML_TYPES
        canAnalyze = isText and not isHtml
        canRecognize = isinstance(widget, QLabel) and filetype in self._IMAGE_TYPES

        self.actSpeak.setEnabled(canAnalyze)
        self.actWord.setEnabled(canAnalyze)
        self.actCloud.setEnabled(canAnalyze)
        self.actCrawl.setEnabled(isHtml)
        self.actRecog.setEnabled(canRecognize)

    def readSpeak(self):
        """Read the text of the active subwindow aloud (blocks until finished)."""
        content = self._active_text()
        if not content:
            return
        engine = pyttsx3.init()
        engine.say(content)
        engine.runAndWait()

    def cutWord(self):
        """Segment the text of the active subwindow and show the word list."""
        content = self._active_text()
        if content is None:
            return
        self.resText = str(self._segment(content))
        self._display_result('分词结果')

    def generCloud(self):
        """
        Build a word cloud from the text of the active subwindow.

        The image is written next to the source document (or into the current
        directory for result windows) and then displayed.
        """
        content = self._active_text()
        if content is None:
            return
        words = self._segment(content)
        source = self._active_source()
        folder = os.path.dirname(source) if source else self.curPath
        path = os.path.join(folder, '词频云图.png')
        try:
            cloud = WordCloud(font_path='simsun.ttc').generate(' '.join(words))
            cloud.to_file(path)
        except (OSError, ValueError) as err:
            # ValueError: no words to plot; OSError: font missing or the
            # target directory is not writable.
            self._warn('无法生成词云：{}'.format(err))
            return
        self._display_result('词频云图', QPixmap(path))

    def titleCrawl(self):
        """
        Extract "title: url" pairs from the HTML source in the active subwindow.

        Every ``<div>`` whose ``data-tools`` attribute mentions ``title`` is
        parsed as a JSON object; entries that are not valid JSON or lack the
        ``title`` / ``url`` keys are skipped.
        """
        content = self._active_text()
        if content is None:
            return
        soup = BeautifulSoup(content, 'html.parser')
        links = []
        for div in soup.find_all('div', attrs={'data-tools': re.compile('title')}):
            # The attribute uses single quotes; JSON requires double quotes.
            raw = str(div.attrs['data-tools']).replace("'", '"')
            try:
                info = json.loads(raw)
                links.append(info['title'] + ': ' + info['url'])
            except (ValueError, KeyError, TypeError):
                continue
        self.resText = ''.join(
            '[{:^3}]{}'.format(number, link) + '\r\n'
            for number, link in enumerate(links, start=1)
        )
        self._display_result('主题链接')

    def textRecog(self):
        """Run OCR (simplified Chinese) on the picture shown in the active subwindow."""
        path = self._active_source()
        if not path:
            return
        try:
            with Image.open(path) as image:
                self.resText = pytesseract.image_to_string(image, lang='chi_sim')
        except (OSError, RuntimeError) as err:
            # Unreadable image, missing tesseract binary, or an OCR failure.
            self._warn('无法识别文字：{}'.format(err))
            return
        self._display_result('识别文字')

    def save(self):
        """
        Save the content of the active subwindow to a user-chosen file.

        Text is written as UTF-8, pictures in the format implied by the file
        extension.  Cancelling the dialog does nothing.
        """
        widget = self._active_widget()
        if isinstance(widget, QPlainTextEdit):
            file, _ = QFileDialog.getSaveFileName(self, filter='*.txt')
            # The dialog returns an empty string (never None) when cancelled.
            if not file:
                return
            file = self._with_default_suffix(file, '.txt')
            try:
                with open(file, 'w', encoding='utf-8') as f:
                    f.write(widget.toPlainText())
            except OSError as err:
                self._warn('无法保存文件：{}\n{}'.format(file, err))
        elif isinstance(widget, QLabel):
            file, _ = QFileDialog.getSaveFileName(self, filter='*.png')
            if not file:
                return
            file = self._with_default_suffix(file, '.png')
            if not widget.pixmap().save(file):
                self._warn('无法保存文件：{}'.format(file))

    def _display_result(self, title: str, message: str | QPixmap | None = None,
                        *, source: str | None = None):
        """
        Open a new MDI subwindow.

        :param title: window title
        :param message: content to show; a string is shown in a text editor,
                        a pixmap in a label, and ``None`` shows ``self.resText``
        :param source: path of the file the content was loaded from, if any;
                       recorded on the subwindow so the menu state and OCR
                       can refer to the file of the *active* window
        """
        subwindow = QMdiSubWindow(self)
        subwindow.setWindowTitle(title)
        # Without this attribute a closed subwindow is merely hidden and the
        # MDI area does not activate the next one.
        subwindow.setAttribute(Qt.WidgetAttribute.WA_DeleteOnClose)
        if source:
            subwindow.setProperty(self._SOURCE_PROPERTY, source)
        # Content widget.
        if isinstance(message, QPixmap):
            widget = QLabel(subwindow)
            widget.setPixmap(message)
        else:
            text = self.resText if message is None else str(message)
            widget = QPlainTextEdit(text, subwindow)
        subwindow.setWidget(widget)
        self.mdiArea.addSubWindow(subwindow)
        subwindow.show()

    def _about_text(self) -> str:
        """Return the class docstring, dedented, for the About dialog."""
        doc = type(self).__doc__ or ''
        return '\n'.join(line.strip() for line in doc.strip().splitlines())

    def _warn(self, text: str):
        """Show ``text`` in a warning box titled like the main window."""
        QMessageBox.warning(self, self.windowTitle(), text)

    def _active_widget(self):
        """Return the widget of the current MDI subwindow, or None without one."""
        subwindow = self.mdiArea.currentSubWindow()
        return None if subwindow is None else subwindow.widget()

    def _active_source(self) -> str:
        """Return the file path recorded on the current subwindow ('' if none)."""
        subwindow = self.mdiArea.currentSubWindow()
        if subwindow is None:
            return ''
        return subwindow.property(self._SOURCE_PROPERTY) or ''

    def _active_text(self):
        """Return the text of the current subwindow, or None if it shows no text."""
        widget = self._active_widget()
        if isinstance(widget, QPlainTextEdit):
            return widget.toPlainText()
        return None

    def _segment(self, content: str) -> list[str]:
        """Strip Chinese punctuation and line breaks, then segment with jieba."""
        content = re.sub('[%s]+' % punctuation, '', content)
        content = re.sub('[%s]+' % '\r\n', '', content)
        # The user dictionary is optional and only needs to be loaded once.
        if not self._userDictLoaded and os.path.isfile(self._USER_DICT):
            jieba.load_userdict(self._USER_DICT)
            self._userDictLoaded = True
        # lcut without arguments uses jieba's default segmentation mode.
        return jieba.lcut(content)

    def _category_for(self, filetype: str):
        """Return (category item, icon path) for an extension, or None if unsupported."""
        if filetype in self._TEXT_TYPES:
            return self.textFiles, 'image/text.jpg'
        if filetype in self._WORD_TYPES:
            return self.wordFiles, 'image/word.jpg'
        if filetype in self._HTML_TYPES:
            return self.htmlFiles, 'image/html.jpg'
        if filetype in self._IMAGE_TYPES:
            return self.picFiles, 'image/pic.jpg'
        return None

    @staticmethod
    def _extension(name: str) -> str:
        """Return the lower-case extension of ``name`` without the dot ('' if none)."""
        return os.path.splitext(name)[1][1:].lower()

    @staticmethod
    def _with_default_suffix(path: str, suffix: str) -> str:
        """Append ``suffix`` to ``path`` when the user typed no extension."""
        return path if os.path.splitext(path)[1] else path + suffix

if __name__ == '__main__':
    # Script entry point: create the Qt application, show the main window,
    # and use the event loop's return value as the process exit code.
    application = QApplication(sys.argv)
    mainWindow = DocAnalyzer()
    mainWindow.show()
    exitCode = application.exec()
    sys.exit(exitCode)
